add level3 defaults for x86 #5418
add level3 defaults for x86 #5418 — commodo wants to merge 1 commit into OpenMathLib:develop from commodo:add-x86-defaults
Conversation
On some x86 configurations, this fails with (see error below) It seems that there are many x86 configurations supported by OpenBLAS, specific for various CPU family names. But, if (on some x86 builds) this isn't met, then some parameters become undefined. Link: openwrt/packages#27179 (comment) ``` In file included from ../../common.h:586, from gemm3m.c:40: gemm3m_level3.c: In function 'cgemm3m_nn': ../../common_param.h:1435:33: error: 'CGEMM3M_DEFAULT_R' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_R'? 1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R' 1671 | #define GEMM3M_R CGEMM3M_R | ^~~~~~~~~ gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R' 306 | for(js = n_from; js < n_to; js += GEMM3M_R){ | ^~~~~~~~ ../../common_param.h:1435:33: note: each undeclared identifier is reported only once for each function it appears in 1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R' 1671 | #define GEMM3M_R CGEMM3M_R | ^~~~~~~~~ gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R' 306 | for(js = n_from; js < n_to; js += GEMM3M_R){ | ^~~~~~~~ ../../common_param.h:1434:33: error: 'CGEMM3M_DEFAULT_Q' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_Q'? 
1434 | #define CGEMM3M_Q CGEMM3M_DEFAULT_Q | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1661:25: note: in expansion of macro 'CGEMM3M_Q' 1661 | #define GEMM3M_Q CGEMM3M_Q | ^~~~~~~~~ gemm3m_level3.c:313:20: note: in expansion of macro 'GEMM3M_Q' 313 | if (min_l >= GEMM3M_Q * 2) { | ^~~~~~~~ i486-openwrt-linux-musl-gcc -Os -pipe -march=pentium-mmx -fno-caller-saves -fno-plt -fhonour-copts -ffile-prefix-map=/builder/build_dir/target-i386_pentium-mmx_musl/OpenBLAS-0.3.30=OpenBLAS-0.3.30 -Wformat -Werror=format-security -fstack-protector -D_FORTIFY_SOURCE=1 -Wl,-z,now -Wl,-z,relro -I/builder/staging_dir/toolchain-i386_pentium-mmx_gcc-14.3.0_musl/usr/include -I/builder/staging_dir/toolchain-i386_pentium-mmx_gcc-14.3.0_musl/include -I/builder/staging_dir/toolchain-i386_pentium-mmx_gcc-14.3.0_musl/include/fortify -DMAX_STACK_ALLOC=2048 -DEXPRECISION -m128bit-long-double -Wall -m32 -DF_INTERFACE_GFORT -fPIC -DC_LAPACK -DNO_LAPACK -DNO_LAPACKE -DNO_AVX -DNO_AVX512 -DSMP_SERVER -DNO_WARMUP -DMAX_CPU_NUMBER=2 -DMAX_PARALLEL_NUMBER=1 -DBUILD_SINGLE=1 -DBUILD_DOUBLE=1 -DBUILD_COMPLEX=1 -DBUILD_COMPLEX16=1 -DVERSION=\"0.3.30\" -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME -DASMNAME= -DASMFNAME=_ -DNAME=_ -DCNAME= -DCHAR_NAME=\"_\" -DCHAR_CNAME=\"\" -DNO_AFFINITY -I. -DMAX_STACK_ALLOC=2048 -DEXPRECISION -m128bit-long-double -Wall -m32 -DF_INTERFACE_GFORT -fPIC -DC_LAPACK -DNO_LAPACK -DNO_LAPACKE -DNO_AVX -DNO_AVX512 -DSMP_SERVER -DNO_WARMUP -DMAX_CPU_NUMBER=2 -DMAX_PARALLEL_NUMBER=1 -DBUILD_SINGLE=1 -DBUILD_DOUBLE=1 -DBUILD_COMPLEX=1 -DBUILD_COMPLEX16=1 -DVERSION=\"0.3.30\" -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME -DASMNAME=cgemm3m_cn -DASMFNAME=cgemm3m_cn_ -DNAME=cgemm3m_cn_ -DCNAME=cgemm3m_cn -DCHAR_NAME=\"cgemm3m_cn_\" -DCHAR_CNAME=\"cgemm3m_cn\" -DNO_AFFINITY -I../.. 
-UDOUBLE -DCOMPLEX -c -UDOUBLE -DCOMPLEX -DCN gemm3m.c -o cgemm3m_cn.o ../../common_param.h:1433:33: error: 'CGEMM3M_DEFAULT_P' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_P'? 1433 | #define CGEMM3M_P CGEMM3M_DEFAULT_P | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1651:25: note: in expansion of macro 'CGEMM3M_P' 1651 | #define GEMM3M_P CGEMM3M_P | ^~~~~~~~~ gemm3m_level3.c:325:20: note: in expansion of macro 'GEMM3M_P' 325 | if (min_i >= GEMM3M_P * 2) { | ^~~~~~~~ ../../common_param.h:1436:33: error: 'CGEMM3M_DEFAULT_UNROLL_M' undeclared (first use in this function); did you mean 'CGEMM3M_DEFAULT_UNROLL_N'? 1436 | #define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M | ^~~~~~~~~~~~~~~~~~~~~~~~ ../../common_param.h:1580:25: note: in expansion of macro 'CGEMM3M_UNROLL_M' 1580 | #define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M | ^~~~~~~~~~~~~~~~ gemm3m_level3.c:329:33: note: in expansion of macro 'GEMM3M_UNROLL_M' 329 | min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | ^~~~~~~~~~~~~~~ make[4]: *** [Makefile:1865: cgemm3m_nn.o] Error 1 make[4]: *** Waiting for unfinished jobs.... In file included from ../../common.h:586, from gemm3m.c:40: gemm3m_level3.c: In function 'cgemm3m_cn': ../../common_param.h:1435:33: error: 'CGEMM3M_DEFAULT_R' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_R'? 
1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R' 1671 | #define GEMM3M_R CGEMM3M_R | ^~~~~~~~~ gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R' 306 | for(js = n_from; js < n_to; js += GEMM3M_R){ | ^~~~~~~~ ../../common_param.h:1435:33: note: each undeclared identifier is reported only once for each function it appears in 1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R' 1671 | #define GEMM3M_R CGEMM3M_R | ^~~~~~~~~ gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R' 306 | for(js = n_from; js < n_to; js += GEMM3M_R){ | ^~~~~~~~ ../../common_param.h:1434:33: error: 'CGEMM3M_DEFAULT_Q' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_Q'? 1434 | #define CGEMM3M_Q CGEMM3M_DEFAULT_Q | ^~~~~~~~~~~~~~~~~ ../../common_param.h:1661:25: note: in expansion of macro 'CGEMM3M_Q' 1661 | #define GEMM3M_Q CGEMM3M_Q | ^~~~~~~~~ gemm3m_level3.c:313:20: note: in expansion of macro 'GEMM3M_Q' 313 | if (min_l >= GEMM3M_Q * 2) { | ^~~~~~~~ ```
|
I'll admit here that the defaults may not be correct, so that is up for debate. This fixes the OpenWrt x86 builds. |
|
How are you building OpenBLAS ? What you saw is what happens when no TARGET option was specified, and autodetection of the cpu model by the |
This change updates the version to 0.3.30 Added patch to fix x86 builds. Also sent upstream: OpenMathLib/OpenBLAS#5418 Signed-off-by: Alexandru Ardelean <alex@shruggie.ro>
|
Do you know which configurations (if any) are failing besides Pentium ? According to kernel/x86/KERNEL.P5 (which includes/redirects to KERNEL.P6), GEMM3M is using 2x2 kernels, which should be reflected in the M and N (where your PR currently has 8 and 4). |
Apologies for the slow reply. |
Short answer is: no. I don't know which other configurations besides Pentium are failing. Once the PR is merged, it goes into a more complete CI, and the results will appear after 1-2 days here under one of those build targets.
My understanding is that there is no #define providing any GEMM3M definitions for P5/P6. I haven't looked too deep into this one. |
This change updates the version to 0.3.30 Added patch to fix x86 builds. Also sent upstream: OpenMathLib/OpenBLAS#5418 Signed-off-by: Alexandru Ardelean <alex@shruggie.ro>
|
can you apply #5442 instead please ? |
will do that :) |
seems to work |
Dropped patch: 0001-Make-GEMM3M-parameters-available-on-32bit-X86-GENERI.patch Part of upstream: OpenMathLib/OpenBLAS#5418 Changelog: - Revert a 0.3.30 optimization that could cause race conditions and invalid results in GEMM - Fix thread lockup with Python 3.9 and NumPy - Fix deadlock in multithreaded code after fork() - Add bfloat16 extensions (BGEMM, BGEMV) and basic FP16 infrastructure - Add batch GEMM operations with strided variants - Add multithreaded LAPACK SLAED3/DLAED3 for improved eigensolvers - Add Apple M4 and Intel Lunar Lake support - Add initial POWER11 architecture support - Improve GEMM performance on A64FX and ARM processors Full release notes: https://github.com/OpenMathLib/OpenBLAS/releases/tag/v0.3.31 Signed-off-by: Alexandru Ardelean <alex@shruggie.ro>
Dropped patch: 0001-Make-GEMM3M-parameters-available-on-32bit-X86-GENERI.patch Part of upstream: OpenMathLib/OpenBLAS#5418 Changelog: - Revert a 0.3.30 optimization that could cause race conditions and invalid results in GEMM - Fix thread lockup with Python 3.9 and NumPy - Fix deadlock in multithreaded code after fork() - Add bfloat16 extensions (BGEMM, BGEMV) and basic FP16 infrastructure - Add batch GEMM operations with strided variants - Add multithreaded LAPACK SLAED3/DLAED3 for improved eigensolvers - Add Apple M4 and Intel Lunar Lake support - Add initial POWER11 architecture support - Improve GEMM performance on A64FX and ARM processors Full release notes: https://github.com/OpenMathLib/OpenBLAS/releases/tag/v0.3.31 Signed-off-by: Alexandru Ardelean <alex@shruggie.ro>
Dropped patch: 0001-Make-GEMM3M-parameters-available-on-32bit-X86-GENERI.patch Part of upstream: OpenMathLib/OpenBLAS#5418 Changelog: - Revert a 0.3.30 optimization that could cause race conditions and invalid results in GEMM - Fix thread lockup with Python 3.9 and NumPy - Fix deadlock in multithreaded code after fork() - Add bfloat16 extensions (BGEMM, BGEMV) and basic FP16 infrastructure - Add batch GEMM operations with strided variants - Add multithreaded LAPACK SLAED3/DLAED3 for improved eigensolvers - Add Apple M4 and Intel Lunar Lake support - Add initial POWER11 architecture support - Improve GEMM performance on A64FX and ARM processors Full release notes: https://github.com/OpenMathLib/OpenBLAS/releases/tag/v0.3.31 Signed-off-by: Alexandru Ardelean <alex@shruggie.ro>
On some x86 configurations, this fails with (see error below)
It seems that there are many x86 configurations supported by OpenBLAS, specific for various CPU family names.
But, if (on some x86 builds) this isn't met, then some parameters become undefined.
Link:
openwrt/packages#27179 (comment)